1. Read the CSV (hard‑coded folder)

library(tidyverse)

knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  fig.width = 14,
  fig.height = 8,
  out.width = "100%",
  dpi = 150
)

# ---- User config ----
folder <- "C:/Users/dunnmk/Downloads/PD_Data/PD_Data"  # CHANGE THIS PATH
time_ref <- 0
time_cmp <- 180

# Plot toggles
trans_combos_to_plot <- c("A_to_D", "C_to_B")
show_extra_fc_plots <- FALSE
dsb_for_cis_correlation <- "DSB1"

# Optional filters (set any to NULL to include everything)
filters <- list(
  batch = NULL,
  time_point = NULL,
  DSB = NULL,
  combo = NULL
)

# ---- Small helpers ----
pct <- function(num, den) dplyr::if_else(den > 0, 100 * num / den, NA_real_)
fc_ratio <- function(num, den, eps = 1e-6) {
  # Avoid misleading 1.0 fold-changes when both numerator and denominator are truly 0.
  # (Previously, eps/eps = 1.0.)
  dplyr::case_when(
    is.na(num) | is.na(den) ~ NA_real_,
    abs(num) < eps & abs(den) < eps ~ NA_real_,
    TRUE ~ (num + eps) / (den + eps)
  )
}

apply_filters <- function(df, filters = list()) {
  if (!is.null(filters$batch) && "batch" %in% names(df)) {
    df <- df %>% dplyr::filter(batch %in% filters$batch)
  }
  if (!is.null(filters$time_point) && "time_point" %in% names(df)) {
    df <- df %>% dplyr::filter(time_point %in% filters$time_point)
  }
  if (!is.null(filters$DSB) && "DSB" %in% names(df)) {
    df <- df %>% dplyr::filter(DSB %in% filters$DSB)
  }
  if (!is.null(filters$combo) && "combo" %in% names(df)) {
    df <- df %>% dplyr::filter(combo %in% filters$combo)
  }
  df
}

theme_allele_x <- function() {
  theme_bw(base_size = 14) +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10),
      axis.text.y = element_text(size = 11),
      strip.text = element_text(size = 13),
      plot.title = element_text(size = 16, face = "bold"),
      plot.margin = margin(5.5, 15, 5.5, 5.5)
    )
}

stopifnot(dir.exists(folder))
files <- list.files(folder, pattern = "_summary\\.csv$", full.names = TRUE)
stopifnot(length(files) > 0)

raw <- purrr::map_dfr(files, readr::read_csv, show_col_types = FALSE)

# Standardize time column name
if ("time" %in% names(raw) && !"time_point" %in% names(raw)) {
  raw <- raw %>% dplyr::rename(time_point = time)
}

raw <- raw %>% dplyr::mutate(time_point = as.numeric(time_point))

filtered <- apply_filters(raw, filters)

filtered |> head(10) |> knitr::kable(caption = "Combined CSV preview (after optional filters)")
Combined CSV preview (after optional filters)
alignment_name cis_trans DSB combo repeat allele count batch time_point
CIS_DSB1_FULL_CHRIII_L02 CIS DSB1 A_to_B FULL CHRIII_L02 51283 batch6 0
CIS_DSB1_FULL_CHRIII_L03 CIS DSB1 A_to_B FULL CHRIII_L03 38174 batch6 0
CIS_DSB1_FULL_CHRIII_L04 CIS DSB1 A_to_B FULL CHRIII_L04 228463 batch6 0
CIS_DSB1_FULL_CHRIV_L01 CIS DSB1 A_to_B FULL CHRIV_L01 49541 batch6 0
CIS_DSB1_FULL_CHRIV_L03 CIS DSB1 A_to_B FULL CHRIV_L03 32857 batch6 0
CIS_DSB1_FULL_CHRIV_L06_2 CIS DSB1 A_to_B FULL CHRIV_L06_2 54088 batch6 0
CIS_DSB1_FULL_CHRIV_L10_2 CIS DSB1 A_to_B FULL CHRIV_L10_2 39551 batch6 0
CIS_DSB1_FULL_CHRIV_L12_2 CIS DSB1 A_to_B FULL CHRIV_L12_2 44777 batch6 0
CIS_DSB1_FULL_CHRIV_L13 CIS DSB1 A_to_B FULL CHRIV_L13 29708 batch6 0
CIS_DSB1_FULL_CHRIV_L15_2 CIS DSB1 A_to_B FULL CHRIV_L15_2 51410 batch6 0

  1. Aggregate counts and percentages

Aggregate counts from DSB monitoring pulldown experiment


summarize_counts_pct <- function(df) {
  df %>%
    summarise(
      Total_Counts   = sum(count),
      Cis_Counts     = sum(count[cis_trans == "CIS"]),
      Trans_Counts   = sum(count[cis_trans == "TRANS"]),
      A_to_B_Counts  = sum(count[combo == "A_to_B"]),
      C_to_D_Counts  = sum(count[combo == "C_to_D"]),
      A_to_D_Counts  = sum(count[combo == "A_to_D"]),
      C_to_B_Counts  = sum(count[combo == "C_to_B"]),
      Percent_Cis    = pct(Cis_Counts, Total_Counts),
      Percent_Trans  = pct(Trans_Counts, Total_Counts),
      Percent_A_to_B = pct(A_to_B_Counts, Total_Counts),
      Percent_C_to_D = pct(C_to_D_Counts, Total_Counts),
      Percent_A_to_D = pct(A_to_D_Counts, Total_Counts),
      Percent_C_to_B = pct(C_to_B_Counts, Total_Counts),
      .groups = "drop"
    )
}

agg_counts <- filtered %>%
  group_by(batch, time_point, DSB) %>%
  summarize_counts_pct()

agg_counts |> knitr::kable(caption = "Aggregated counts and percentages")
Aggregated counts and percentages
batch time_point DSB Total_Counts Cis_Counts Trans_Counts A_to_B_Counts C_to_D_Counts A_to_D_Counts C_to_B_Counts Percent_Cis Percent_Trans Percent_A_to_B Percent_C_to_D Percent_A_to_D Percent_C_to_B
batch6 0 DSB1 1238149 1238149 0 1238149 0 0 0 100 0 100 0 0.00000 0.00000
batch6 0 DSB2 1777638 1777638 0 0 1777638 0 0 100 0 0 100 0.00000 0.00000
batch6 0 TRANS 7447 0 7447 0 0 4588 2859 0 100 0 0 61.60870 38.39130
batch6 180 DSB1 1549523 1549523 0 1549523 0 0 0 100 0 100 0 0.00000 0.00000
batch6 180 DSB2 1759385 1759385 0 0 1759385 0 0 100 0 0 100 0.00000 0.00000
batch6 180 TRANS 230252 0 230252 0 0 162974 67278 0 100 0 0 70.78071 29.21929
batch8 0 DSB1 483724 483724 0 483724 0 0 0 100 0 100 0 0.00000 0.00000
batch8 0 DSB2 1057475 1057475 0 0 1057475 0 0 100 0 0 100 0.00000 0.00000
batch8 0 TRANS 1284 0 1284 0 0 749 535 0 100 0 0 58.33333 41.66667
batch8 180 DSB1 1511971 1511971 0 1511971 0 0 0 100 0 100 0 0.00000 0.00000
batch8 180 DSB2 1726168 1726168 0 0 1726168 0 0 100 0 0 100 0.00000 0.00000
batch8 180 TRANS 254293 0 254293 0 0 174852 79441 0 100 0 0 68.76005 31.23995

2.5. Batch-level QC plots (counts and CIS/TRANS composition)

These plots are intended to sanity-check overall depth and composition before allele-level cis/trans normalization.

# Re-use the same aggregation logic as the table above
summarize_qc_by_batch <- function(df) {
  df %>%
    group_by(batch, time_point, DSB) %>%
    summarize_counts_pct()
}

add_allbatch_shares <- function(agg_qc) {
  totals <- agg_qc %>%
    group_by(time_point) %>%
    summarise(
      Total_AllBatches = sum(Total_Counts),
      Total_Cis_AllBatches = sum(Cis_Counts),
      Total_Trans_AllBatches = sum(Trans_Counts),
      .groups = "drop"
    )

  agg_qc %>%
    left_join(totals, by = "time_point") %>%
    mutate(
      Percent_Total_of_AllBatches = pct(Total_Counts, Total_AllBatches),
      Percent_Cis_of_AllBatches = pct(Cis_Counts, Total_Cis_AllBatches),
      Percent_Trans_of_AllBatches = pct(Trans_Counts, Total_Trans_AllBatches)
    )
}

# Main QC table
agg_qc <- summarize_qc_by_batch(filtered)

# Percent-of-total across ALL batches (within each time point).
# These percentages are intended to SUM TO 100 across batches (and DSB bars) per time point.
agg_qc_allbatches <- add_allbatch_shares(agg_qc)

# ---- 1) Total counts by batch ----
p_total_counts <- ggplot(agg_qc, aes(x = batch, y = Total_Counts, fill = DSB)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.75) +
  geom_text(
    aes(label = scales::comma(Total_Counts)),
    position = position_dodge(width = 0.8),
    vjust = -0.25,
    size = 3.4
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Total counts by batch (faceted by time)",
    x = "Batch",
    y = "Total counts"
  )

if (nrow(agg_qc) > 0) {
  print(p_total_counts)
}

# Optional: total-count share across ALL batches (sums to 100 within each time point)
p_total_counts_share <- ggplot(agg_qc_allbatches, aes(x = batch, y = Percent_Total_of_AllBatches, fill = DSB)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.75) +
  geom_text(
    aes(label = if_else(is.na(Percent_Total_of_AllBatches), NA_character_, paste0(round(Percent_Total_of_AllBatches, 1), "%"))),
    position = position_dodge(width = 0.8),
    vjust = -0.25,
    size = 3.2
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Total-count share by batch (of ALL batches) — bars sum to 100",
    x = "Batch",
    y = "% of total counts (across all batches)",
    fill = "DSB"
  )

if (nrow(agg_qc_allbatches) > 0 && any(!is.na(agg_qc_allbatches$Percent_Total_of_AllBatches))) {
  print(p_total_counts_share)
}

# ---- 2) Counts by category within each batch (split CIS-only vs TRANS-only) ----
cis_counts_long <- agg_qc %>%
  select(
    batch, time_point, DSB,
    Cis_Counts,
    A_to_B_Counts, C_to_D_Counts
  ) %>%
  pivot_longer(
    cols = -c(batch, time_point, DSB),
    names_to = "Metric",
    values_to = "Counts"
  ) %>%
  mutate(
    Metric = factor(
      Metric,
      levels = c("Cis_Counts", "A_to_B_Counts", "C_to_D_Counts"),
      labels = c("CIS (total)", "A→B (cis)", "C→D (cis)")
    )
  )

p_cis_counts_by_type <- ggplot(cis_counts_long, aes(x = batch, y = Counts, fill = DSB)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = scales::comma(Counts)),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 3.0
  ) +
  facet_grid(Metric ~ time_point, scales = "free_y") +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "CIS counts by category per batch — all DSBs on one graph",
    x = "Batch",
    y = "Counts",
    fill = "DSB"
  )

if (nrow(cis_counts_long) > 0) {
  print(p_cis_counts_by_type)
}

trans_counts_long <- agg_qc %>%
  select(
    batch, time_point, DSB,
    Trans_Counts,
    A_to_D_Counts, C_to_B_Counts
  ) %>%
  pivot_longer(
    cols = -c(batch, time_point, DSB),
    names_to = "Metric",
    values_to = "Counts"
  ) %>%
  mutate(
    Metric = factor(
      Metric,
      levels = c("Trans_Counts", "A_to_D_Counts", "C_to_B_Counts"),
      labels = c("TRANS (total)", "A→D (trans)", "C→B (trans)")
    )
  )

p_trans_counts_by_type <- ggplot(trans_counts_long, aes(x = batch, y = Counts, fill = DSB)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = scales::comma(Counts)),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 3.0
  ) +
  facet_grid(Metric ~ time_point, scales = "free_y") +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "TRANS counts by category per batch — all DSBs on one graph",
    x = "Batch",
    y = "Counts",
    fill = "DSB"
  )

if (nrow(trans_counts_long) > 0) {
  print(p_trans_counts_by_type)
}

# ---- 3) Percent CIS and Percent TRANS per group (split) ----
pct_cis_only <- agg_qc_allbatches %>%
  select(batch, time_point, DSB, Percent_Cis_of_AllBatches) %>%
  mutate(Percent_Cis_Label = if_else(is.na(Percent_Cis_of_AllBatches), NA_character_, paste0(round(Percent_Cis_of_AllBatches, 1), "%")))

p_pct_cis <- ggplot(pct_cis_only, aes(x = batch, y = Percent_Cis_of_AllBatches, fill = DSB)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(aes(label = Percent_Cis_Label), position = position_dodge(width = 0.85), vjust = -0.25, size = 3.2) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "CIS share by batch (of ALL batches) — bars sum to 100",
    x = "Batch",
    y = "% of CIS counts (across all batches)",
    fill = "DSB"
  )

if (nrow(pct_cis_only) > 0 && any(!is.na(pct_cis_only$Percent_Cis_of_AllBatches))) {
  print(p_pct_cis)
}

pct_trans_only <- agg_qc_allbatches %>%
  select(batch, time_point, DSB, Percent_Trans_of_AllBatches) %>%
  mutate(Percent_Trans_Label = if_else(is.na(Percent_Trans_of_AllBatches), NA_character_, paste0(round(Percent_Trans_of_AllBatches, 1), "%")))

p_pct_trans <- ggplot(pct_trans_only, aes(x = batch, y = Percent_Trans_of_AllBatches, fill = DSB)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(aes(label = Percent_Trans_Label), position = position_dodge(width = 0.85), vjust = -0.25, size = 3.2) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "TRANS share by batch (of ALL batches) — bars sum to 100",
    x = "Batch",
    y = "% of TRANS counts (across all batches)",
    fill = "DSB"
  )

if (nrow(pct_trans_only) > 0 && any(!is.na(pct_trans_only$Percent_Trans_of_AllBatches))) {
  print(p_pct_trans)
}

# ---- 4) CIS/TRANS combo share (of ALL batches) ----
# These plots are computed as % of ALL CIS (or ALL TRANS) counts across batches,
# so the bars/segments within each time point SUM TO 100.
summarize_combo_counts <- function(df) {
  df %>%
    group_by(batch, time_point, DSB, cis_trans, combo) %>%
    summarise(Combo_Counts = sum(count), .groups = "drop")
}

combo_counts <- summarize_combo_counts(filtered) %>%
  mutate(DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")))

# CIS: only A_to_B and C_to_D
cis_combo_counts <- combo_counts %>%
  filter(cis_trans == "CIS", combo %in% c("A_to_B", "C_to_D"))

cis_totals <- cis_combo_counts %>%
  group_by(time_point) %>%
  summarise(Total_Cis_2Combo_AllBatches = sum(Combo_Counts), .groups = "drop")

cis_combo_share <- cis_combo_counts %>%
  left_join(cis_totals, by = "time_point") %>%
  mutate(
    Percent = pct(Combo_Counts, Total_Cis_2Combo_AllBatches),
    Combo = dplyr::recode(combo, A_to_B = "A→B (CIS)", C_to_D = "C→D (CIS)")
  )

p_cis_combo_share <- ggplot(
  cis_combo_share,
  aes(x = interaction(batch, DSB, sep = " | "), y = Percent, fill = Combo)
) +
  geom_col(width = 0.8) +
  geom_text(
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 3.0
  ) +
  facet_wrap(~ time_point) +
  scale_x_discrete(labels = function(x) sub(" \\|.*$", "", x)) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "CIS combo share (A→B + C→D) of ALL batches — bars sum to 100",
    x = "Batch",
    y = "% of CIS counts (across all batches)",
    fill = ""
  )

if (nrow(cis_combo_share) > 0 && any(!is.na(cis_combo_share$Percent))) {
  print(p_cis_combo_share)
}

# TRANS: only A_to_D and C_to_B
trans_combo_counts <- combo_counts %>%
  filter(cis_trans == "TRANS", combo %in% c("A_to_D", "C_to_B"))

trans_totals <- trans_combo_counts %>%
  group_by(time_point) %>%
  summarise(Total_Trans_2Combo_AllBatches = sum(Combo_Counts), .groups = "drop")

trans_combo_share <- trans_combo_counts %>%
  left_join(trans_totals, by = "time_point") %>%
  mutate(
    Percent = pct(Combo_Counts, Total_Trans_2Combo_AllBatches),
    Combo = dplyr::recode(combo, A_to_D = "A→D (TRANS)", C_to_B = "C→B (TRANS)")
  )

p_trans_combo_share <- ggplot(
  trans_combo_share,
  aes(x = interaction(batch, DSB, sep = " | "), y = Percent, fill = Combo)
) +
  geom_col(width = 0.8) +
  geom_text(
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 3.0
  ) +
  facet_wrap(~ time_point) +
  scale_x_discrete(labels = function(x) sub(" \\|.*$", "", x)) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "TRANS combo share (A→D + C→B) of ALL batches — bars sum to 100",
    x = "Batch",
    y = "% of TRANS counts (across all batches)",
    fill = ""
  )

if (nrow(trans_combo_share) > 0 && any(!is.na(trans_combo_share$Percent))) {
  print(p_trans_combo_share)
}

*** 3. Cis/trans normalization — per allele


location_pct_by_allele <- function(df) {
  by_allele <- df %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      Cis_Location_Counts   = sum(count[cis_trans == "CIS"]),
      Trans_Location_Counts = sum(count[cis_trans == "TRANS"]),
      .groups = "drop"
    )

  totals <- by_allele %>%
    group_by(batch, time_point, DSB) %>%
    summarise(
      Total_Cis_Location_Counts   = sum(Cis_Location_Counts),
      Total_Trans_Location_Counts = sum(Trans_Location_Counts),
      .groups = "drop"
    )

  by_allele %>%
    left_join(totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(
      Total_Group_Counts = Total_Cis_Location_Counts + Total_Trans_Location_Counts,

      # Percent definitions requested:
      #   CIS%  = cis_counts / total_counts (within the group)
      #   TRANS% = trans_counts / total_counts (within the group)
      Percent_Cis_of_GroupTotal = pct(Cis_Location_Counts, Total_Group_Counts),
      Percent_Trans_of_GroupTotal = pct(Trans_Location_Counts, Total_Group_Counts),

      Percent_Location_in_Cis   = pct(Cis_Location_Counts, Total_Cis_Location_Counts),
      Percent_Location_in_Trans = pct(Trans_Location_Counts, Total_Trans_Location_Counts)
    )
}

dat_norm <- location_pct_by_allele(filtered)
dat_norm |> head(10) |> knitr::kable(caption = "Data after cis/trans normalization")
Data after cis/trans normalization
batch time_point DSB allele Cis_Location_Counts Trans_Location_Counts Total_Cis_Location_Counts Total_Trans_Location_Counts Total_Group_Counts Percent_Cis_of_GroupTotal Percent_Trans_of_GroupTotal Percent_Location_in_Cis Percent_Location_in_Trans
batch6 0 DSB1 CHRIII_L02 52550 0 1238149 0 1238149 4.244239 0 4.244239 NA
batch6 0 DSB1 CHRIII_L03 39051 0 1238149 0 1238149 3.153982 0 3.153982 NA
batch6 0 DSB1 CHRIII_L04 234459 0 1238149 0 1238149 18.936251 0 18.936251 NA
batch6 0 DSB1 CHRIV_L01 50897 0 1238149 0 1238149 4.110733 0 4.110733 NA
batch6 0 DSB1 CHRIV_L03 33625 0 1238149 0 1238149 2.715747 0 2.715747 NA
batch6 0 DSB1 CHRIV_L06_2 55461 0 1238149 0 1238149 4.479348 0 4.479348 NA
batch6 0 DSB1 CHRIV_L10_2 40604 0 1238149 0 1238149 3.279411 0 3.279411 NA
batch6 0 DSB1 CHRIV_L12_2 45904 0 1238149 0 1238149 3.707470 0 3.707470 NA
batch6 0 DSB1 CHRIV_L13 30438 0 1238149 0 1238149 2.458347 0 2.458347 NA
batch6 0 DSB1 CHRIV_L15_2 52669 0 1238149 0 1238149 4.253850 0 4.253850 NA

  1. Allele frequency (CIS + TRANS)

allele_frequency <- function(df) {
  by_allele <- df %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      Cis_Counts   = sum(count[cis_trans == "CIS"]),
      Trans_Counts = sum(count[cis_trans == "TRANS"]),
      Allele_Total = Cis_Counts + Trans_Counts,
      .groups = "drop"
    )

  totals <- by_allele %>%
    group_by(batch, time_point, DSB) %>%
    summarise(
      Total_All = sum(Allele_Total),
      .groups = "drop"
    )

  by_allele %>%
    left_join(totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(Allele_Frequency = dplyr::if_else(Total_All > 0, Allele_Total / Total_All, NA_real_))
}

dat_allele_freq <- allele_frequency(filtered)
dat_allele_freq |> head(10) |> knitr::kable(caption = "Allele frequencies")
Allele frequencies
batch time_point DSB allele Cis_Counts Trans_Counts Allele_Total Total_All Allele_Frequency
batch6 0 DSB1 CHRIII_L02 52550 0 52550 1238149 0.0424424
batch6 0 DSB1 CHRIII_L03 39051 0 39051 1238149 0.0315398
batch6 0 DSB1 CHRIII_L04 234459 0 234459 1238149 0.1893625
batch6 0 DSB1 CHRIV_L01 50897 0 50897 1238149 0.0411073
batch6 0 DSB1 CHRIV_L03 33625 0 33625 1238149 0.0271575
batch6 0 DSB1 CHRIV_L06_2 55461 0 55461 1238149 0.0447935
batch6 0 DSB1 CHRIV_L10_2 40604 0 40604 1238149 0.0327941
batch6 0 DSB1 CHRIV_L12_2 45904 0 45904 1238149 0.0370747
batch6 0 DSB1 CHRIV_L13 30438 0 30438 1238149 0.0245835
batch6 0 DSB1 CHRIV_L15_2 52669 0 52669 1238149 0.0425385

  1. Fold change calculations (time_cmp / time_ref)

fold_change_table <- function(df_norm, value_col, time_ref, time_cmp) {
  df_norm %>%
    filter(time_point %in% c(time_ref, time_cmp)) %>%
    select(batch, time_point, DSB, allele, !!rlang::sym(value_col)) %>%
    pivot_wider(
      names_from = time_point,
      values_from = !!rlang::sym(value_col),
      values_fill = 0
    ) %>%
    mutate(
      FoldChange = fc_ratio(.data[[as.character(time_cmp)]], .data[[as.character(time_ref)]]),
      Log2FC = log2(FoldChange)
    )
}

# Fold change based on allele frequency (CIS + TRANS combined).
dat_fc_allele_freq <- fold_change_table(dat_allele_freq, "Allele_Frequency", time_ref, time_cmp) %>%
  rename(
    FoldChange_AlleleFreq_vs_ref = FoldChange,
    Log2FC_AlleleFreq_vs_ref = Log2FC
  )

dat_fc_cis <- fold_change_table(dat_norm, "Percent_Cis_of_GroupTotal", time_ref, time_cmp) %>%
  rename(
    FoldChange_Cis_vs_ref = FoldChange,
    Log2FC_Cis_vs_ref = Log2FC
  )

dat_fc_trans <- fold_change_table(dat_norm, "Percent_Trans_of_GroupTotal", time_ref, time_cmp) %>%
  rename(
    FoldChange_Trans_vs_ref = FoldChange,
    Log2FC_Trans_vs_ref = Log2FC
  )

dat_fc_allele_freq %>%
  mutate(
    FoldChange_AlleleFreq_vs_ref = round(FoldChange_AlleleFreq_vs_ref, 6),
    Log2FC_AlleleFreq_vs_ref = round(Log2FC_AlleleFreq_vs_ref, 6)
  ) %>%
  head(10) %>%
  knitr::kable(caption = "Fold change (time_cmp/time_ref) based on allele frequency")
Fold change (time_cmp/time_ref) based on allele frequency
batch DSB allele 0 180 FoldChange_AlleleFreq_vs_ref Log2FC_AlleleFreq_vs_ref
batch6 DSB1 CHRIII_L02 0.0424424 0.0147833 0.348329 -1.521478
batch6 DSB1 CHRIII_L03 0.0315398 0.0366422 1.161772 0.216327
batch6 DSB1 CHRIII_L04 0.1893625 0.1869349 0.987180 -0.018614
batch6 DSB1 CHRIV_L01 0.0411073 0.0374831 0.911838 -0.133150
batch6 DSB1 CHRIV_L03 0.0271575 0.0100328 0.369452 -1.436539
batch6 DSB1 CHRIV_L06_2 0.0447935 0.0453636 1.012728 0.018247
batch6 DSB1 CHRIV_L10_2 0.0327941 0.0336897 1.027309 0.038870
batch6 DSB1 CHRIV_L12_2 0.0370747 0.0496959 1.340418 0.422683
batch6 DSB1 CHRIV_L13 0.0245835 0.0267431 1.087844 0.121472
batch6 DSB1 CHRIV_L15_2 0.0425385 0.0526704 1.238176 0.308217
# Make the later plots larger/more readable.
# (This updates defaults for subsequent chunks only.)
knitr::opts_chunk$set(
  fig.width = 16,
  fig.height = 9,
  out.width = "100%",
  dpi = 160
)

  1. CIS and TRANS bar plots

plot_cis_percent_by_batch <- function(dat_norm) {
  df <- dat_norm %>%
    filter(!is.na(Percent_Cis_of_GroupTotal)) %>%
    mutate(
      time_point = factor(time_point, levels = sort(unique(time_point)))
    )

  batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_b <- df %>% filter(batch == b)

    dsbs <- df_b %>%
      distinct(DSB) %>%
      arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
      pull(DSB)

    for (d in dsbs) {
      df_bd <- df_b %>% filter(DSB == d)

      if (nrow(df_bd) == 0 || all(is.na(df_bd$Percent_Cis_of_GroupTotal))) next

      p <- ggplot(df_bd, aes(x = allele, y = Percent_Cis_of_GroupTotal)) +
        geom_col(width = 0.85, fill = "#2C7FB8") +
        geom_text(
          aes(label = if_else(is.na(Percent_Cis_of_GroupTotal), NA_character_, paste0(round(Percent_Cis_of_GroupTotal, 1), "%"))),
          vjust = -0.25,
          size = 3.0,
          angle = 90
        ) +
        facet_wrap(~ time_point, scales = "free_x") +
        scale_y_continuous(limits = c(0, 100)) +
        theme_allele_x() +
        labs(
          title = paste0("CIS% by allele (cis_counts / total_counts) | Batch: ", b, " | ", d),
          x = "Allele",
          y = "CIS% of total counts (within group)"
        )

      print(p)
    }
  }
}

plot_cis_percent_by_batch(dat_norm)

plot_trans_percent_by_batch <- function(dat_norm) {
  df <- dat_norm %>%
    filter(!is.na(Percent_Trans_of_GroupTotal)) %>%
    mutate(
      time_point = factor(time_point, levels = sort(unique(time_point)))
    )

  batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_b <- df %>% filter(batch == b)

    dsbs <- df_b %>%
      distinct(DSB) %>%
      arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
      pull(DSB)

    for (d in dsbs) {
      df_bd <- df_b %>% filter(DSB == d)

      if (nrow(df_bd) == 0 || all(is.na(df_bd$Percent_Trans_of_GroupTotal))) next

      p <- ggplot(df_bd, aes(x = allele, y = Percent_Trans_of_GroupTotal)) +
        geom_col(width = 0.85, fill = "#F03B20") +
        geom_text(
          aes(label = if_else(is.na(Percent_Trans_of_GroupTotal), NA_character_, paste0(round(Percent_Trans_of_GroupTotal, 1), "%"))),
          vjust = -0.25,
          size = 3.0,
          angle = 90
        ) +
        facet_wrap(~ time_point, scales = "free_x") +
        scale_y_continuous(limits = c(0, 100)) +
        theme_allele_x() +
        labs(
          title = paste0("TRANS% by allele (trans_counts / total_counts) | Batch: ", b, " | ", d),
          x = "Allele",
          y = "TRANS% of total counts (within group)"
        )

      print(p)
    }
  }
}

plot_trans_percent_by_batch(dat_norm)

# Optional: within-TRANS combo breakdowns with T0/T120 on the SAME graph.
# (Bars are colored by time.)
plot_trans_combo_by_time <- function(dat, combo_name, time_levels = c(0, 120)) {
  df_counts <- dat %>%
    filter(combo == combo_name, time_point %in% time_levels) %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(Combo_Counts = sum(count), .groups = "drop")

  if (nrow(df_counts) == 0) return(NULL)

  totals <- df_counts %>%
    group_by(batch, time_point, DSB) %>%
    summarise(Total_Combo = sum(Combo_Counts), .groups = "drop")

  df_plot <- df_counts %>%
    left_join(totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(
      Percent_in_Combo = pct(Combo_Counts, Total_Combo),
      time_point = factor(time_point, levels = time_levels, labels = paste0("T", time_levels))
    )

  if (nrow(df_plot) == 0 || all(is.na(df_plot$Percent_in_Combo))) return(NULL)

  ggplot(df_plot, aes(x = allele, y = Percent_in_Combo, fill = time_point)) +
      geom_col(position = position_dodge(width = 0.9), width = 0.85) +
      geom_text(
        aes(label = if_else(is.na(Percent_in_Combo), NA_character_, paste0(round(Percent_in_Combo, 1), "%"))),
        position = position_dodge(width = 0.9),
        vjust = -0.25,
        size = 2.0,
        angle = 90
      ) +
      facet_grid(batch + DSB ~ ., scales = "free_x") +
      scale_y_continuous(limits = c(0, 100)) +
      theme_allele_x() +
      labs(
        title = paste0("Percent of ", combo_name, " counts by allele (T0 vs T120)"),
        x = "Allele",
        y = "% within combo (within group)",
        fill = "Time"
      )
}

for (cmb in trans_combos_to_plot) {
  p <- plot_trans_combo_by_time(filtered, cmb, time_levels = c(0, 120))
  if (!is.null(p)) print(p)
}


  1. Fold change and allele frequency plots

plot_foldchange_by_batch <- function(df, fc_col, title_prefix) {
  df <- df %>% mutate(DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")))
  batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)

  for (b in batches) {
    df_b <- df %>% filter(batch == b)

    dsbs <- df_b %>%
      distinct(DSB) %>%
      arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
      pull(DSB)

    for (d in dsbs) {
      df_bd <- df_b %>% filter(DSB == d)

      if (nrow(df_bd) == 0 || all(is.na(df_bd[[fc_col]]))) next

      p <- ggplot(df_bd, aes(x = allele, y = .data[[fc_col]])) +
        geom_col(width = 0.85, fill = "#636363") +
        geom_hline(yintercept = 2, linetype = "dotted", color = "red", size = 0.7) +
        geom_text(
          aes(label = if_else(is.na(.data[[fc_col]]), NA_character_, sprintf("%.4f", .data[[fc_col]]))),
          vjust = -0.25,
          size = 2.8,
          angle = 90
        ) +
        theme_allele_x() +
        labs(
          title = paste0(title_prefix, " | Batch: ", b, " | ", d),
          x = "Allele",
          y = "Fold change"
        )
      print(p)
    }
  }
}

plot_allele_frequency_by_batch <- function(dat_allele_freq) {
  df <- dat_allele_freq %>% mutate(DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")))
  batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)

  for (b in batches) {
    df_b <- df %>% filter(batch == b)

    dsbs <- df_b %>%
      distinct(DSB) %>%
      arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
      pull(DSB)

    for (d in dsbs) {
      df_bd <- df_b %>% filter(DSB == d)

      if (nrow(df_bd) == 0 || all(is.na(df_bd$Allele_Frequency))) next

      p <- ggplot(df_bd, aes(x = allele, y = Allele_Frequency)) +
        geom_col(width = 0.85, fill = "#31A354") +
        geom_text(
          aes(label = if_else(is.na(Allele_Frequency), NA_character_, sprintf("%.3f", Allele_Frequency))),
          vjust = -0.25,
          size = 2.8,
          angle = 90
        ) +
        facet_wrap(~ time_point, scales = "free_x") +
        theme_allele_x() +
        labs(
          title = paste0("Allele Frequency (CIS + TRANS) | Batch: ", b, " | ", d),
          x = "Allele",
          y = "Allele Frequency"
        )
      print(p)
    }
  }
}

# Generate updated plots
plot_foldchange_by_batch(
  dat_fc_cis,
  "FoldChange_Cis_vs_ref",
  paste0("Fold change (", time_cmp, "/", time_ref, ") CIS by allele")
)

plot_foldchange_by_batch(
  dat_fc_trans,
  "FoldChange_Trans_vs_ref",
  paste0("Fold change (", time_cmp, "/", time_ref, ") TRANS by allele")
)

plot_foldchange_by_batch(
  dat_fc_allele_freq,
  "FoldChange_AlleleFreq_vs_ref",
  paste0("Fold change (", time_cmp, "/", time_ref, ") Allele frequency (CIS + TRANS) by allele")
)

plot_allele_frequency_by_batch(dat_allele_freq)

if (isTRUE(show_extra_fc_plots)) {
  if (nrow(dat_fc_cis) == 0 || nrow(dat_fc_trans) == 0) {
    message("Skipping extra FC plots: no data in dat_fc_cis/dat_fc_trans")
  } else {
  p1 <- ggplot(dat_fc_cis, aes(x = allele, y = FoldChange_Cis_vs_ref, fill = DSB)) +
    geom_col(width = 0.9, position = "dodge") +
    geom_hline(yintercept = 2, linetype = "dotted", color = "red", size = 0.7) +
    geom_text(aes(label = round(FoldChange_Cis_vs_ref, 2)),
              position = position_dodge(width = 0.9),
              vjust = -0.3, size = 2.5, angle = 90) +
    facet_wrap(~ batch, scales = "free_x") +
    theme_allele_x() +
    labs(
      title = paste0("Fold change (", time_cmp, "/", time_ref, ") CIS by allele"),
      x = "Allele", y = "Fold change"
    )

  p2 <- ggplot(dat_fc_trans, aes(x = allele, y = FoldChange_Trans_vs_ref, fill = DSB)) +
    geom_col(width = 0.9, position = "dodge") +
    geom_hline(yintercept = 2, linetype = "dotted", color = "red", size = 0.7) +
    geom_text(aes(label = round(FoldChange_Trans_vs_ref, 2)),
              position = position_dodge(width = 0.9),
              vjust = -0.3, size = 2.5, angle = 90) +
    facet_wrap(~ batch, scales = "free_x") +
    theme_allele_x() +
    labs(
      title = paste0("Fold change (", time_cmp, "/", time_ref, ") TRANS by allele"),
      x = "Allele", y = "Fold change"
    )

  print(p1)
  print(p2)
  }
}

  1. Correlation: log2FC vs Allele Frequency

library(ggrepel)

dat_wide <- dat_norm %>%
  filter(time_point %in% c(time_ref, time_cmp)) %>%
  select(batch, DSB, allele, time_point,
         Percent_Cis_of_GroupTotal, Percent_Trans_of_GroupTotal) %>%
  pivot_wider(
    names_from  = time_point,
    values_from = c(Percent_Cis_of_GroupTotal, Percent_Trans_of_GroupTotal),
    values_fill = 0
  ) %>%
  mutate(
    log2FC_CIS   = log2(fc_ratio(.data[[paste0("Percent_Cis_of_GroupTotal_", time_cmp)]],
                                .data[[paste0("Percent_Cis_of_GroupTotal_", time_ref)]])),
    log2FC_TRANS = log2(fc_ratio(.data[[paste0("Percent_Trans_of_GroupTotal_", time_cmp)]],
                                .data[[paste0("Percent_Trans_of_GroupTotal_", time_ref)]]))
  )

dat_fc_af <- dat_wide %>%
  inner_join(
    dat_allele_freq %>%
      filter(time_point == time_cmp) %>%
      select(batch, DSB, allele, Allele_Frequency),
    by = c("batch", "DSB", "allele")
  ) %>%
  filter(!is.na(Allele_Frequency) & Allele_Frequency > 0)

cor_summary <- dat_fc_af %>%
  group_by(batch, DSB) %>%
  summarise(
    cor_CIS_AF   = ifelse(n() >= 2, cor(log2FC_CIS, Allele_Frequency), NA),
    cor_TRANS_AF = ifelse(n() >= 2, cor(log2FC_TRANS, Allele_Frequency), NA),
    n_obs = n(),
    .groups = "drop"
  )

knitr::kable(cor_summary, caption = "Correlation: log2FC vs Allele Frequency")
Correlation: log2FC vs Allele Frequency
batch DSB cor_CIS_AF cor_TRANS_AF n_obs
batch6 DSB1 0.3887672 NA 25
batch6 DSB2 0.2250024 NA 25
batch6 TRANS NA -0.0156710 25
batch8 DSB1 0.6312123 NA 25
batch8 DSB2 -0.1127632 NA 25
batch8 TRANS NA 0.0249546 25
plot_correlation_cis <- function(dat_fc_af, batch_name, dsb_name = dsb_for_cis_correlation) {
  df_batch <- dat_fc_af %>% filter(batch == batch_name, DSB == dsb_name)
  if (nrow(df_batch) < 2) return(NULL)
  
  ggplot(df_batch, aes(x = Allele_Frequency, y = log2FC_CIS, label = allele)) +
    geom_point(size = 3, alpha = 0.8, color = "darkgreen") +
    geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE, color = "red") +
    geom_text_repel(size = 3, max.overlaps = 20) +
    theme_bw() +
    labs(
      title = paste0("Log2FC CIS vs Allele Freq | Batch: ", batch_name, " | ", dsb_name),
      x = "Allele Frequency", y = "log2FC CIS"
    )
}

plot_correlation_trans <- function(dat_fc_af, batch_name) {
  df_batch <- dat_fc_af %>% filter(batch == batch_name)
  if (nrow(df_batch) < 2) return(NULL)
  
  ggplot(df_batch, aes(x = Allele_Frequency, y = log2FC_TRANS, color = DSB, label = allele)) +
    geom_point(size = 3, alpha = 0.8) +
    geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE) +
    geom_text_repel(size = 3, max.overlaps = 20) +
    theme_bw() +
    labs(
      title = paste0("Log2FC TRANS vs Allele Freq | Batch: ", batch_name),
      x = "Allele Frequency", y = "log2FC TRANS"
    )
}

unique_batches <- unique(dat_fc_af$batch)

for (b in unique_batches) {
  p_cis   <- plot_correlation_cis(dat_fc_af, b)
  p_trans <- plot_correlation_trans(dat_fc_af, b)
  
  if (!is.null(p_cis))   print(p_cis)
  if (!is.null(p_trans)) print(p_trans)
}